import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy import stats as st


def _to_long_format(wide):
    """Reshape a wide plasma table into long format.

    The 'Unnamed: 0' column is kept as the identifier and renamed to 'Mass';
    every other column label goes into 'Thickness' and the matching cell
    values (melt's default 'value' column) are renamed to
    'Plasma_Concentration'.
    """
    tidy = wide.melt(id_vars=['Unnamed: 0'], var_name='Thickness')
    return tidy.rename(
        columns={"Unnamed: 0": "Mass", "value": 'Plasma_Concentration'}
    )


# data_4.csv -- presumably the 4-hour data set named in the assignment text.
df_4 = pd.read_csv('data_4.csv')
df_4.head()  # wide layout, as read from disk


df_4 = _to_long_format(df_4)
df_4.head()  # long layout: Mass / Thickness / Plasma_Concentration


# data_8.csv -- presumably the 8-hour data set; handled exactly like df_4.
df_8 = pd.read_csv('data_8.csv')
df_8.head()


df_8 = _to_long_format(df_8)
df_8.head()


# Show the column labels of df_4 (a bare expression: only displayed when run
# as a notebook cell).
df_4.columns

Index(['Mass', 'Thickness', 'Plasma_Concentration'], dtype='object')


# One box plot of plasma concentration per (data set, factor) pair, drawn in
# the order df_4/Mass, df_4/Thickness, df_8/Mass, df_8/Thickness.
for frame in (df_4, df_8):
    for factor in ('Mass', 'Thickness'):
        frame.boxplot(column=['Plasma_Concentration'], by=factor, figsize=(5, 6))
        plt.show()


# Two-way ANOVA (typ=2 sums of squares) of Plasma_Concentration on the
# categorical factors Mass and Thickness, fitted separately for each data set.
# NOTE(review): the df_4 model includes the Mass:Thickness interaction while
# the df_8 model is additive only -- confirm this difference is intentional.
formula_4 = 'Plasma_Concentration ~ C(Mass) + C(Thickness) + C(Mass):C(Thickness)'
model = ols(formula_4, data=df_4).fit()
sm.stats.anova_lm(model, typ=2)


formula_8 = 'Plasma_Concentration ~ C(Mass) + C(Thickness)'
model = ols(formula_8, data=df_8).fit()
sm.stats.anova_lm(model, typ=2)


# Load the biochar data set from the Excel workbook.
df = pd.read_excel('Biocharcoals.xlsx')


df.head()


# Drop every row that has a missing value in ANY column.
# NOTE(review): this also discards rows whose only gaps are in columns other
# than 'Grav'; if only 'Grav' is analysed downstream, confirm whether
# dropna(subset=['Grav']) was intended instead.
df = df.dropna()
df.head()


# Mean and standard deviation of 'Grav' for each treatment.
# The aggregations are given as a list (not a set) so the resulting columns
# are always ordered (mean, std); the bar colours below rely on that order.
# Grouping with the default index and then calling reset_index() yields the
# columns Treatment / mean / std on a fresh 0..n-1 index regardless of how the
# installed pandas version treats as_index=False for multi-function agg.
df_grouped = (
    df.groupby('Treatment')['Grav']
    .agg(['mean', 'std'])
    .reset_index()
)


df_grouped


# Grouped bar chart, one group per treatment: the first colour ('b') goes to
# the 'mean' column and the second ('r') to the 'std' column.
df_grouped.set_index('Treatment').plot.bar(color=['b', 'r'], rot=45)
plt.title("Mean and Standard Deviation of Grav by Treatment")
plt.ylabel('Values')
plt.show()


# Count the rows for each distinct value of 'Soil' (displayed as cell output).
df['Soil'].value_counts()

Chehalis      108
Willamette    108
Name: Soil, dtype: int64


# Boolean row masks, one per soil type.
Chehalis_idx = df["Soil"].eq("Chehalis")
Willamette_idx = df["Soil"].eq("Willamette")


# 'Grav' measurements belonging to each soil type.
Chehalis_Grav = df.loc[Chehalis_idx, "Grav"]
Willamette_Grav = df.loc[Willamette_idx, "Grav"]


# One-way ANOVA of 'Grav' across the two soil groups.
stat, pval = st.f_oneway(Chehalis_Grav, Willamette_Grav)
print('p-values', pval)
# Decision at alpha = 0.05 (5%).
print(
    "Null Hypothesis is Rejected"
    if pval < 0.05
    else "Failed to Reject Null Hypothesis"
)

p-values 2.6477869219833055e-38
Null Hypothesis is Rejected


# Count the rows for each distinct value of 'Biochar' (displayed as cell output).
df['Biochar'].value_counts()

350     54
500     54
700     54
None    54
Name: Biochar, dtype: int64


# Boolean row masks, one per biochar level; the column mixes the numbers
# 350/500/700 with the string label 'None'.
idx_350 = df["Biochar"].eq(350)
idx_500 = df["Biochar"].eq(500)
idx_700 = df["Biochar"].eq(700)
idx_None = df["Biochar"].eq('None')


# 'Grav' measurements belonging to each biochar level.
Grav_350 = df.loc[idx_350, "Grav"]
Grav_500 = df.loc[idx_500, "Grav"]
Grav_700 = df.loc[idx_700, "Grav"]
Grav_None = df.loc[idx_None, "Grav"]


# One-way ANOVA of 'Grav' across the four biochar groups.
stat, pval = st.f_oneway(Grav_350, Grav_500, Grav_700, Grav_None)
print('p-values', pval)
# Decision at alpha = 0.05 (5%).
print(
    "Null Hypothesis is Rejected"
    if pval < 0.05
    else "Failed to Reject Null Hypothesis"
)

p-values 0.06719121812784043
Failed to Reject Null Hypothesis


# Two-way ANOVA of 'Grav' on the categorical factors Soil and Biochar,
# including their interaction term.
grav_formula = 'Grav ~ C(Soil) + C(Biochar) + C(Soil):C(Biochar)'
model = ols(grav_formula, data=df).fit()


# ANOVA table with typ=2 sums of squares (displayed as cell output).
sm.stats.anova_lm(model, typ=2)


# Tukey HSD pairwise comparison of 'Grav' between the Soil groups at a 5%
# family-wise error rate.
tukey = pairwise_tukeyhsd(df['Grav'], df['Soil'], alpha=0.05)

print(tukey)

   Multiple Comparison of Means - Tukey HSD, FWER=0.05   
=========================================================
 group1    group2   meandiff p-adj  lower   upper  reject
---------------------------------------------------------
Chehalis Willamette  -0.0786 0.001 -0.0883 -0.0689   True
---------------------------------------------------------


# Biochar levels converted to strings -- presumably so the numeric levels and
# the 'None' label share a single type when used as group labels.
biochar = df['Biochar'].astype(str)


# Tukey HSD pairwise comparison of 'Grav' between the Biochar groups at a 5%
# family-wise error rate.
tukey = pairwise_tukeyhsd(df['Grav'], biochar, alpha=0.05)

print(tukey)

Multiple Comparison of Means - Tukey HSD, FWER=0.05
===================================================
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
   350    500    0.007    0.9 -0.0194 0.0333  False
   350    700   0.0043    0.9  -0.022 0.0307  False
   350   None  -0.0179 0.2977 -0.0442 0.0085  False
   500    700  -0.0027    0.9  -0.029 0.0237  False
   500   None  -0.0248 0.0727 -0.0512 0.0015  False
   700   None  -0.0222  0.132 -0.0485 0.0042  False
---------------------------------------------------

	Unnamed: 0	t1	t2	t3	t4
0	c1	0.0208	0.3208	0.4208	0.7208
1	c1	0.0000	0.1436	0.2436	0.5436
2	c1	0.4664	0.7664	0.8664	1.1664
3	c1	0.0000	0.0000	0.0000	0.0000
4	c1	1.9642	2.2642	2.3642	2.6642

	Mass	Thickness	Plasma_Concentration
0	c1	t1	0.0208
1	c1	t1	0.0000
2	c1	t1	0.4664
3	c1	t1	0.0000
4	c1	t1	1.9642

	Unnamed: 0	t1	t2	t3	t4
0	c1	0.9201	1.2201	1.3201	1.6201
1	c1	0.3800	0.6800	0.7800	1.0800
2	c1	0.3652	0.6652	0.7652	1.0652
3	c1	0.0000	0.0000	0.0018	0.3018
4	c1	1.4187	1.7187	1.8187	2.1187

	Mass	Thickness	Plasma_Concentration
0	c1	t1	0.9201
1	c1	t1	0.3800
2	c1	t1	0.3652
3	c1	t1	0.0000
4	c1	t1	1.4187

	sum_sq	df	F	PR(>F)
C(Mass)	20.702632	3.0	6.368337	0.000761
C(Thickness)	4.075817	3.0	1.253762	0.297774
C(Mass):C(Thickness)	0.054839	9.0	0.005623	1.000000
Residual	69.351877	64.0	NaN	NaN

Purpose¶

Analysis of variance.¶

Instructions¶

Problem 1¶

Create a box and whisker plot for each factor at each time. (Hint: There will only be two plots for each time set. You do not need to separate out each combination of factor levels. Four box-plots total.)¶

Perform a two-way analysis of variance to determine the significance of each factor at each point in time. In other words, perform the ANOVA on the data set at 4 hours and also the data set at 8 hours.¶

4 Hours¶

8 Hours¶

Analyze the results of the ANOVA tests to determine if each factor has a statistically significant influence on the plasma concentration at that time.¶

If the goal was to achieve the highest blood concentration at any time, which combination of factor levels would you choose?¶

If the goal was to achieve the highest blood concentration at 8 hours, which combination of factor levels would you choose?¶

If the goal was to achieve the most consistent blood concentration over time, which combination of factor levels would you choose?¶

Problem 2:¶

1. Import the file "Biocharcoals.xlsx" you used for the Midterm Project into a Jupyter notebook. Plot the treatment means and standard deviations for the "Grav" variable¶

2. Perform a 1-way ANOVA for the factor "Soil" (HINT: check out the df.dropna() function)¶

Perform a 1-way ANOVA for the factor "Biochar"¶

Perform a 2-way ANOVA for the factors "Soil" and "Biochar" with the interaction¶

Perform a Tukey's HSD posthoc test on your 2-way ANOVA (Hint: first use the df.dropna() function again, but only using the "Grav" column)¶

	sum_sq	df	F	PR(>F)
C(Mass)	3.670774	3.0	3.130997	0.030712
C(Thickness)	4.584936	3.0	3.910734	0.011984
Residual	28.528345	73.0	NaN	NaN

	Soil	Biochar	Replicate	Treatment	day	Temp	DeltaCO2	FractionCO2	Grav	Nitrate
0	Chehalis	350	1	Chehalis.350	0	NaN	NaN	NaN	0.427248	5.079171
1	Chehalis	350	1	Chehalis.350	1	24.30	0.005872	0.707971	0.420356	11.092042
2	Chehalis	350	1	Chehalis.350	2	24.60	0.006985	0.642727	0.419754	10.118050
3	Chehalis	350	1	Chehalis.350	3	25.15	0.004096	0.458804	0.422973	10.903952
4	Chehalis	350	1	Chehalis.350	4	25.20	0.003023	0.356825	0.424141	12.643656

	Treatment	mean	std
0	Chehalis.0	0.396191	0.025297
1	Chehalis.350	0.431703	0.029243
2	Chehalis.500	0.443616	0.026858
3	Chehalis.700	0.438545	0.060386
4	Willamette.0	0.347809	0.046381
5	Willamette.350	0.348051	0.021975
6	Willamette.500	0.350081	0.020354
7	Willamette.700	0.349836	0.020466

	sum_sq	df	F	PR(>F)
C(Soil)	0.333354	1.0	285.829386	6.400521e-41
C(Biochar)	0.020300	3.0	5.801913	7.934483e-04
C(Soil):C(Biochar)	0.017062	3.0	4.876608	2.680177e-03
Residual	0.242584	208.0	NaN	NaN